from pptx import Presentation
from langchain_core.documents import Document

# Extract the text carried by the shapes of a PPT file
def parse_ppt(filename):
    """Collect the non-blank text of each shape on every slide.

    Args:
        filename: Value handed straight to ``Presentation`` to open the deck.

    Returns:
        A list of text strings, ordered by slide and then by shape. Text that
        is empty or whitespace-only is left out, as is any shape whose
        ``text`` access raises ``AttributeError``.

    NOTE(review): only the shapes yielded directly by ``slide.shapes`` are
    visited; text nested deeper (e.g. inside groups or tables) is presumably
    not reached -- confirm against the python-pptx docs if that matters.
    """
    deck = Presentation(filename)
    texts = []
    for slide in deck.slides:
        for shape in slide.shapes:
            try:
                content = shape.text
                has_content = bool(content.strip())
            except AttributeError:
                # No usable text on this shape; move on to the next one.
                continue
            if has_content:
                texts.append(content)
    return texts

# Keep the deck path in one place so the provenance recorded on every Document
# matches the file that is actually parsed (previously the metadata named a
# different, hard-coded path).
PPT_PATH = "D:/ideaSpace/rag-in-action-master/90-文档-Data/黑悟空/黑神话悟空.pptx"

# Parse the PPT file into a list of text fragments
ppt_elements = parse_ppt(PPT_PATH)
print("PPT 内容：")

# Convert each text fragment into a LangChain Document
documents = [
    Document(page_content=element, metadata={"source": PPT_PATH})
    for element in ppt_elements
]

# Print the first few converted Documents as a sample
print(documents[:3])